Group Project - July 23, 2018

This is a course project for an MS in Business Analytics, seeking to determine which factors drive employee attrition and to what degree. The data come from the IBM Data Science team's "HR Analytics" dataset on Kaggle.com.

Setups:

library(ggplot2)
library(caret)
## Loading required package: lattice
library(MASS)
library(tree)
library(knitr)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(gbm)
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
library(corrplot)
## corrplot 0.84 loaded
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
library(e1071)


1. Data Cleaning

Overview

# Overview: 1470 obs. of  35 variables
# NOTE(review): `attribution` is assumed to have been loaded earlier (e.g.
# via read.csv on the IBM HR dataset); the loading step is not shown here.
str(attribution)
## 'data.frame':    1470 obs. of  35 variables:
##  $ Age                     : int  41 49 37 33 27 32 59 30 38 36 ...
##  $ Attrition               : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 1 1 1 ...
##  $ BusinessTravel          : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 2 3 2 3 2 3 3 2 3 ...
##  $ DailyRate               : int  1102 279 1373 1392 591 1005 1324 1358 216 1299 ...
##  $ Department              : Factor w/ 3 levels "Human Resources",..: 3 2 2 2 2 2 2 2 2 2 ...
##  $ DistanceFromHome        : int  1 8 2 3 2 2 3 24 23 27 ...
##  $ Education               : int  2 1 2 4 1 2 3 1 3 3 ...
##  $ EducationField          : Factor w/ 6 levels "Human Resources",..: 2 2 5 2 4 2 4 2 2 4 ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  1 2 4 5 7 8 10 11 12 13 ...
##  $ EnvironmentSatisfaction : int  2 3 4 4 1 4 3 4 4 3 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 2 1 2 2 2 ...
##  $ HourlyRate              : int  94 61 92 56 40 79 81 67 44 94 ...
##  $ JobInvolvement          : int  3 2 2 3 3 3 4 3 2 3 ...
##  $ JobLevel                : int  2 2 1 1 1 1 1 1 3 2 ...
##  $ JobRole                 : Factor w/ 9 levels "Healthcare Representative",..: 8 7 3 7 3 3 3 3 5 1 ...
##  $ JobSatisfaction         : int  4 2 3 3 2 4 1 3 3 3 ...
##  $ MaritalStatus           : Factor w/ 3 levels "Divorced","Married",..: 3 2 3 2 2 3 2 1 3 2 ...
##  $ MonthlyIncome           : int  5993 5130 2090 2909 3468 3068 2670 2693 9526 5237 ...
##  $ MonthlyRate             : int  19479 24907 2396 23159 16632 11864 9964 13335 8787 16577 ...
##  $ NumCompaniesWorked      : int  8 1 6 1 9 0 4 1 0 6 ...
##  $ Over18                  : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ OverTime                : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 1 2 1 1 1 ...
##  $ PercentSalaryHike       : int  11 23 15 11 12 13 20 22 21 13 ...
##  $ PerformanceRating       : int  3 4 3 3 3 3 4 4 4 3 ...
##  $ RelationshipSatisfaction: int  1 4 2 3 4 3 1 2 2 2 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  0 1 0 0 1 0 3 1 0 2 ...
##  $ TotalWorkingYears       : int  8 10 7 8 6 8 12 1 10 17 ...
##  $ TrainingTimesLastYear   : int  0 3 3 3 3 2 3 2 2 3 ...
##  $ WorkLifeBalance         : int  1 3 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : int  6 10 0 8 2 7 1 1 9 7 ...
##  $ YearsInCurrentRole      : int  4 7 0 7 2 7 0 0 7 7 ...
##  $ YearsSinceLastPromotion : int  0 1 0 3 2 3 0 0 1 7 ...
##  $ YearsWithCurrManager    : int  5 7 0 0 2 6 0 0 8 7 ...

Change Strings to multi-level factors:

# Ensure the categorical columns are multi-level factors (a no-op for columns
# read.csv already converted — see the str() output above).
factor_cols = c("Attrition", "BusinessTravel", "Department", "EducationField",
                "Gender", "JobRole", "MaritalStatus", "OverTime")
attribution[factor_cols] = lapply(attribution[factor_cols], as.factor)
# Any missing values? (none in this dataset)
any(is.na(attribution))
## [1] FALSE

Remove columns Over18, EmployeeCount, EmployeeNumber, and StandardHours, which are constant or identifier columns and therefore not relevant to Attrition.

# Drop columns with no predictive value: EmployeeCount and StandardHours are
# constant, Over18 has a single level, and EmployeeNumber is an identifier.
# (Name-based removal replaces the fragile positional indices 9, 10, 22, 27.)
attribution = attribution[, !(names(attribution) %in%
  c("EmployeeCount", "EmployeeNumber", "Over18", "StandardHours"))]

2. EDA: Correlation

# Correlation matrix of the numeric predictors, rendered as a mixed corrplot
# (coefficients in the upper triangle, circles in the lower).
num_cols <- vapply(attribution, is.numeric, logical(1))
corr_mat <- cor(attribution[, num_cols])
corrplot.mixed(corr_mat, tl.cex = 0.6)

As a result:
JobLevel & MonthlyIncome: 0.95
JobLevel & WorkingYears: 0.78
PercentSalaryHike & Performance Rating: 0.77
MonthlyIncome & Working Years: 0.77
YearsatCompany & YearsCurrManager: 0.77
YearsatCompany & YearInCurrentRole: 0.76
YearInCurrentRole & YearsCurrManager: 0.71
Age & WorkingYears: 0.68
WorkingYears & YearsatCompany: 0.63
YearsatCompany & YearssinceLastPromotion: 0.62

3. EDA: Visualization of each independent variable against Attrition

# EDA: one plot per predictor against Attrition. Density plots are used for
# continuous variables; faceted within-group proportion bar charts for
# categorical/ordinal ones.
# NOTE(review): the ..prop.. / stat = "count" idiom is from ggplot2 2.x;
# newer ggplot2 versions prefer after_stat(prop) — confirm the installed
# package version before modernizing.
ggplot(attribution, aes(x=Age, fill=Attrition)) + geom_density(alpha=0.3)

# Those who travel frequently are more likely to leave
g = ggplot(attribution, aes(x = Attrition, group = BusinessTravel)) + geom_bar(aes(y = ..prop..), stat="count", fill="#FFA500", alpha=0.3)
g +facet_grid(.~BusinessTravel) + ggtitle("BusinessTravel")+theme_bw()+geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ), stat= "count", vjust = -.3)

# Attrition vs. DailyRate: people with a lower daily rate tend to leave
ggplot(attribution, aes(x=DailyRate, fill=Attrition)) + geom_density(alpha=0.3)

# People in Sales are more likely to leave
g = ggplot(attribution, aes(x =Attrition , group = Department))+geom_bar(aes(y = ..prop..), stat="count", fill = "#FFA500", alpha=0.3)
g +facet_grid(.~Department) + ggtitle("Department")+theme_bw() + geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ), stat= "count", vjust = -.3)

# People living far away tend to leave
ggplot(attribution, aes(x=DistanceFromHome, fill=Attrition)) + geom_density(alpha=0.3)

# The higher the education level, the more likely to stay
g = ggplot(attribution, aes(x = Attrition, group = Education)) + geom_bar(aes(y = ..prop..), stat="count", fill="#FFA500", alpha=0.3)
g + facet_grid(.~Education) + ggtitle("Education")+theme_bw()+geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ), stat= "count", vjust = -.3)

# Those with HR, Marketing and Technical education backgrounds are more likely to leave
g = ggplot(attribution, aes(x = Attrition, group = EducationField)) + geom_bar(aes(y = ..prop..), stat="count", fill="#FFA500", alpha=0.3)
g + facet_grid(.~EducationField) + ggtitle("EducationField")+theme_bw()+geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ), stat= "count", vjust = -.3)

# Environment satisfaction is important
g = ggplot(attribution, aes(x = Attrition, group = EnvironmentSatisfaction)) + geom_bar(aes(y = ..prop..), stat="count", fill="#FFA500", alpha=0.3)
g + facet_grid(.~EnvironmentSatisfaction) + ggtitle("EnvironmentSatisfaction")+theme_bw()+geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ), stat= "count", vjust = -.3)

# Males are more likely to leave
g = ggplot(attribution, aes(x = Attrition, group = Gender)) + geom_bar(aes(y = ..prop..), stat="count", fill="#FFA500", alpha=0.3)
g + facet_grid(.~Gender) + ggtitle("Gender")+theme_bw()+geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ), stat= "count", vjust = -.3)

# Hourly rate: a lower rate is associated with leaving
ggplot(attribution, aes(x=HourlyRate, fill=Attrition)) + geom_density(alpha=0.3)

# Job involvement matters (a lot)
g = ggplot(attribution, aes(x = Attrition, group = JobInvolvement)) + geom_bar(aes(y = ..prop..), stat="count", fill="#FFA500", alpha=0.3)
g + facet_grid(.~JobInvolvement) + ggtitle("JobInvolvement")+theme_bw()+geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ), stat= "count", vjust = -.3)

# Job level matters
g = ggplot(attribution, aes(x = Attrition, group = JobLevel)) + geom_bar(aes(y = ..prop..), stat="count", fill="#FFA500", alpha=0.3)
g + facet_grid(.~JobLevel) + ggtitle("JobLevel")+theme_bw()+geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ), stat= "count", vjust = -.3)

# Job role matters (HR, laboratory technicians and sales representatives are more likely to leave)
g = ggplot(attribution, aes(x = Attrition, group = JobRole)) + geom_bar(aes(y = ..prop..), stat="count", fill="#FFA500", alpha=0.3)
g + facet_grid(.~JobRole) + ggtitle("JobRole")+theme_bw()+geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ), stat= "count", hjust=0.3,vjust = -.3)

# Job satisfaction matters (a lot)
g = ggplot(attribution, aes(x = Attrition, group = JobSatisfaction)) + geom_bar(aes(y = ..prop..), stat="count", fill="#FFA500", alpha=0.3)
g + facet_grid(.~JobSatisfaction) + ggtitle("JobSatisfaction")+theme_bw()+geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ), stat= "count", vjust = -.3)

# Single people tend to leave
g = ggplot(attribution, aes(x = Attrition, group = MaritalStatus)) + geom_bar(aes(y = ..prop..), stat="count", fill="#FFA500", alpha=0.3)
g + facet_grid(.~MaritalStatus) + ggtitle("MaritalStatus")+theme_bw()+geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ), stat= "count", vjust = -.3)

# Monthly income: matters
ggplot(attribution, aes(x=MonthlyIncome, fill=Attrition)) + geom_density(alpha=0.3)

# Monthly rate: not clear
ggplot(attribution, aes(x=MonthlyRate, fill=Attrition)) + geom_density(alpha=0.3)

# Number of companies worked: matters
ggplot(attribution, aes(x=NumCompaniesWorked, fill=Attrition)) + geom_density(alpha=0.3)

# Those who work overtime are more likely to leave
g = ggplot(attribution, aes(x =Attrition , group = OverTime)) + geom_bar(aes(y = ..prop..), stat="count", fill="#FFA500", alpha=0.3)
g + facet_grid(.~OverTime) + ggtitle("OverTime")+theme_bw()+geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ), stat= "count", vjust = -.3)

# Percent salary hike: matters
ggplot(attribution, aes(x=PercentSalaryHike, fill=Attrition)) + geom_density(alpha=0.3)

# Performance rating: does not appear informative
ggplot(attribution, aes(x=PerformanceRating, fill=Attrition)) + geom_density(alpha=0.3)

# Stock option level
ggplot(attribution, aes(x=StockOptionLevel, fill=Attrition)) + geom_density(alpha=0.3)

# Total working years
ggplot(attribution, aes(x=TotalWorkingYears, fill=Attrition)) + geom_density(alpha=0.3)

# TrainingTimesLastYear: does not appear informative
ggplot(attribution, aes(x=TrainingTimesLastYear, fill=Attrition)) + geom_density(alpha=0.3)

# Work-life balance (a higher score --> stay, but the highest score, 4, is an exception)
g = ggplot(attribution, aes(x = Attrition, group = WorkLifeBalance)) + geom_bar(aes(y = ..prop..), stat="count", fill="#FFA500", alpha=0.3)
g + facet_grid(.~WorkLifeBalance) + ggtitle("WorkLifeBalance")+theme_bw()+geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ), stat= "count", vjust = -.3)

# Years at company matters
ggplot(attribution, aes(x=YearsAtCompany, fill=Attrition)) + geom_density(alpha=0.3)

# Years in current role
ggplot(attribution, aes(x=YearsInCurrentRole, fill=Attrition)) + geom_density(alpha=0.3)

# YearsSinceLastPromotion: does not appear informative
ggplot(attribution, aes(x=YearsSinceLastPromotion, fill=Attrition)) + geom_density(alpha=0.3)

# Years with current manager matters
ggplot(attribution, aes(x=YearsWithCurrManager, fill=Attrition)) + geom_density(alpha=0.3)

4. Modeling

We consider logistic regression, boosting, random forest, and KNN.

Because the data are highly unbalanced — most employees have Attrition “No” — we added a classification threshold to the models. For each model, we tested different thresholds and picked the one that gave the best predictions.

We compare models using accuracy, the percentage of actual leavers among those predicted to leave, and the percentage of actual leavers among those predicted to stay, because our emphasis is on predicting the people who intend to leave the company.

Performance Measurement Criteria
# Performance measurement helpers --------------------------------------------

# Overall accuracy: the share of predictions that match the actual labels.
calc_acc <- function(actual, predicted) {
  mean(actual == predicted)
}

# Precision on "leave": of the people we predicted to leave (TP + FP),
# the fraction who actually left (TP).
calc_stay <- function(TP, FP) {
  TP / (TP + FP)
}

# Fraction x / (x + y), e.g. actual leavers among those predicted to stay.
calc_leave <- function(y, x) {
  x / (x + y)
}

We use 70% of our dataset as training data.

set.seed(432)  # reproducible split
# Hold out 30% of the rows as a test set; train on the remaining 70%.
# (A stray evaluated literal `32` left over from editing was removed here.)
index = sample(nrow(attribution), size = trunc(0.7 * nrow(attribution)))
train_data= attribution[index, ]
test_data = attribution[-index, ]
Logistic

First, we start with logistic regression and use mixed (both-direction stepwise) selection to obtain 18 important variables. From the model summary below, most of these variables are statistically significant.

Modeling and Feature Selection using step-wise:

set.seed(432)
# Null (intercept-only) and full logistic models: the search bounds for
# stepwise selection.
null = glm(Attrition~1, data=train_data, family="binomial")
log.fit =glm(Attrition~., data=train_data, family="binomial")
# Mixed (both-direction) stepwise search, output suppressed with trace=0.
regboth = step(null, scope=formula(log.fit), direction="both", trace=0)
# Refit using the 18 variables chosen by the stepwise search above; the
# formula is written out explicitly so the final model is visible at a glance.
log.fit = glm(Attrition ~ OverTime + JobRole + JobInvolvement +
MaritalStatus + JobSatisfaction + EnvironmentSatisfaction +
BusinessTravel + DistanceFromHome + YearsInCurrentRole +
YearsSinceLastPromotion + TrainingTimesLastYear + Age +
NumCompaniesWorked + RelationshipSatisfaction + WorkLifeBalance +
YearsWithCurrManager + YearsAtCompany + TotalWorkingYears, data = train_data, family="binomial")
summary(log.fit)
## 
## Call:
## glm(formula = Attrition ~ OverTime + JobRole + JobInvolvement + 
##     MaritalStatus + JobSatisfaction + EnvironmentSatisfaction + 
##     BusinessTravel + DistanceFromHome + YearsInCurrentRole + 
##     YearsSinceLastPromotion + TrainingTimesLastYear + Age + NumCompaniesWorked + 
##     RelationshipSatisfaction + WorkLifeBalance + YearsWithCurrManager + 
##     YearsAtCompany + TotalWorkingYears, family = "binomial", 
##     data = train_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8729  -0.4979  -0.2467  -0.0899   3.2222  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      0.99979    1.12183   0.891 0.372817    
## OverTimeYes                      2.10223    0.23751   8.851  < 2e-16 ***
## JobRoleHuman Resources           1.57058    0.66705   2.355 0.018546 *  
## JobRoleLaboratory Technician     1.41633    0.49442   2.865 0.004175 ** 
## JobRoleManager                   0.48063    0.75034   0.641 0.521816    
## JobRoleManufacturing Director   -0.16765    0.63244  -0.265 0.790938    
## JobRoleResearch Director        -1.59394    1.21943  -1.307 0.191172    
## JobRoleResearch Scientist        0.28258    0.49757   0.568 0.570093    
## JobRoleSales Executive           1.11301    0.47592   2.339 0.019353 *  
## JobRoleSales Representative      1.92596    0.57585   3.345 0.000824 ***
## JobInvolvement                  -0.61042    0.14931  -4.088 4.35e-05 ***
## MaritalStatusMarried             0.47576    0.31400   1.515 0.129737    
## MaritalStatusSingle              1.53499    0.32505   4.722 2.33e-06 ***
## JobSatisfaction                 -0.35508    0.09710  -3.657 0.000255 ***
## EnvironmentSatisfaction         -0.39092    0.09802  -3.988 6.66e-05 ***
## BusinessTravelTravel_Frequently  2.00391    0.54463   3.679 0.000234 ***
## BusinessTravelTravel_Rarely      1.19438    0.50598   2.361 0.018249 *  
## DistanceFromHome                 0.05322    0.01290   4.126 3.69e-05 ***
## YearsInCurrentRole              -0.13467    0.05571  -2.417 0.015642 *  
## YearsSinceLastPromotion          0.15101    0.05021   3.007 0.002636 ** 
## TrainingTimesLastYear           -0.18098    0.08631  -2.097 0.036006 *  
## Age                             -0.02984    0.01589  -1.878 0.060378 .  
## NumCompaniesWorked               0.18635    0.04520   4.123 3.74e-05 ***
## RelationshipSatisfaction        -0.31144    0.09807  -3.176 0.001494 ** 
## WorkLifeBalance                 -0.24699    0.15000  -1.647 0.099639 .  
## YearsWithCurrManager            -0.12106    0.05813  -2.083 0.037284 *  
## YearsAtCompany                   0.06879    0.04816   1.428 0.153218    
## TotalWorkingYears               -0.06762    0.03285  -2.059 0.039540 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 896.03  on 1028  degrees of freedom
## Residual deviance: 594.20  on 1001  degrees of freedom
## AIC: 650.2
## 
## Number of Fisher Scoring iterations: 7

After testing, a threshold of 0.7 maximize out-of-sample accuracy. Threshold setting:

# Classify as "Yes" (leave) only when the predicted probability >= 0.7; the
# high threshold was chosen by testing because the classes are imbalanced.
log_pred = ifelse(predict(log.fit, newdata = test_data, type = "response") >= 0.7, 'Yes', 'No')
# Confusion matrix on the held-out test set.
t1 = table(predicted = log_pred, actual = test_data$Attrition)
t1
##          actual
## predicted  No Yes
##       No  364  55
##       Yes   2  20
KNN

In KNN, we use the features above and apply 10-fold cross-validation to tune the parameter k.

library(class) 
library(kknn) 
## 
## Attaching package: 'kknn'
## The following object is masked from 'package:caret':
## 
##     contr.dummy
# Drop the factor columns by position (3, 5, 8, 10, 14, 16, 20) so the
# remaining numeric columns can be scaled.
# NOTE(review): positional indices assume the post-cleaning column order —
# fragile if the cleaning step changes.
df_non_dummy<- attribution[,c(-3,-5,-8,-10,-14,-16,-20)]
# Standardize the numeric predictors (column 2 is Attrition, excluded).
x<- scale(df_non_dummy[,-2])
dfready<- data.frame(x)
# Re-attach the Attrition response as column 24.
dfready[,24]<- df_non_dummy[,2]
colnames(dfready)[24]<- "Attrition"
# 80/20 split. NOTE(review): 1470 is hard-coded — prefer nrow(dfready).
train <- sample(1:1470,0.8*1470)
a = 1:1470
test = a[-train]
# Initial knn fit with k = 5 as a baseline.
nearest5 <- kknn(Attrition~., dfready[train,],dfready[-train,], k=5)


# Convert kknn posterior probabilities to class labels.
#
# kknnprob: matrix of class probabilities from kknn (column 2 = P("Yes")).
# thres:    classification threshold applied to the "Yes" probability.
# Returns a factor of "No"/"Yes" labels, one per row of kknnprob.
fit <- function(kknnprob, thres = 0.5) {
  # Vectorized replacement for the original element-wise loop. Also fixes a
  # bug where the result was seeded with factor(c("Yes","No")), so inputs
  # with fewer than 2 rows returned a padded, incorrect-length factor.
  labels <- ifelse(kknnprob[, 2] > thres, "Yes", "No")
  factor(labels, levels = c("No", "Yes"))
}

# Add the factor columns back (unscaled) alongside the scaled numeric ones.
dfready<-data.frame(c(dfready, attribution[,c(3,5,8,10,14,16,20)]))
#model 
# The model formula used for knn, displayed as a string for the report:
"Attrition ~ Age + OverTime + MaritalStatus + JobRole + BusinessTravel + 
    NumCompaniesWorked + JobInvolvement + JobSatisfaction + DistanceFromHome + 
YearsInCurrentRole + YearsSinceLastPromotion + TotalWorkingYears + 
RelationshipSatisfaction + DailyRate + TrainingTimesLastYear + 
StockOptionLevel"
## [1] "Attrition ~ Age + OverTime + MaritalStatus + JobRole + BusinessTravel + \n    NumCompaniesWorked + JobInvolvement + JobSatisfaction + DistanceFromHome + \nYearsInCurrentRole + YearsSinceLastPromotion + TotalWorkingYears + \nRelationshipSatisfaction + DailyRate + TrainingTimesLastYear + \nStockOptionLevel"
# 10-fold cross-validation over k = 11..110 to tune knn.
# NOTE(review): despite the name, out_MSE stores raw misclassification
# counts per fold and candidate k, not squared error.
n = nrow(dfready)
kcv = 10
n0 = round(n/kcv,0)

out_MSE = matrix(0,kcv,100)

used = NULL
set = 1:n

# Each fold draws a fresh validation set from the not-yet-used rows.
for(j in 1:kcv){
  
  if(n0<length(set)){val = sample(set,n0)}
  
  # The last fold takes whatever rows remain.
  if(n0>=length(set)){val=set}
  
  train_i = dfready[-val,]
  test_i = dfready[val,]
  
  
  # Fit knn for each candidate k; column i-10 of out_MSE holds k = i.
  # NOTE(review): the stray leading "+" signs inside the formula act as
  # unary plus and do not change the model — left as-is.
  for(i in 11:110){
    
    near = kknn(Attrition ~ Age
                +                     NumCompaniesWorked + JobInvolvement + JobSatisfaction + DistanceFromHome + 
                  +                     YearsInCurrentRole + YearsSinceLastPromotion + TotalWorkingYears + 
                  +                     RelationshipSatisfaction + DailyRate + TrainingTimesLastYear + 
                  +                     StockOptionLevel + BusinessTravel + Department + EducationField + Gender+JobRole + MaritalStatus + OverTime ,train_i,test_i,k=i,kernel = "rectangular")
    localCount = 0
    # Classify with the tuned probability threshold of 0.27.
    ft<- fit(near$prob, 0.27)
    # Count misclassifications on this fold.
    for(g in 1:nrow(test_i)){
      if(test_i[g,"Attrition"]!=ft[g]){
        localCount = localCount+1
      }
    }
    
    out_MSE[j,i-10] = localCount
  }
  
  used = union(used,val)
  set = (1:n)[-used]
  
  # Progress indicator: one line per completed fold.
  cat(j,'\n')
  
}
## 1 
## 2 
## 3 
## 4 
## 5 
## 6 
## 7 
## 8 
## 9 
## 10
# Average misclassification count per candidate k across the 10 folds.
mMSE = apply(out_MSE,2,mean)
best = which.min(mMSE)
# Out-of-sample accuracy at the best k, using the size of the last fold.
print(1-(mMSE[best]/nrow(test_i)))
## [1] 0.8639456
# NOTE(review): the x-axis uses column indices 1:100 while k actually runs
# 11..110, so the k= labels below are offset manually — confirm positions
# against the rendered plot.
plot(log(1/(1:100)),sqrt(mMSE),xlab="Complexity (log(1/k))",ylab="out-of-sample RMSE",col=4,lwd=2,type="l",cex.lab=1.2,main=paste("kfold(",kcv,")"))
best = which.min(mMSE)
text(log(1/best),sqrt(mMSE[best])-0.005,paste("k=",best),col=2,cex=1)
text(log(1/11)+2.25,sqrt(mMSE[2])-0.3+0.27,paste("k=",11),col=2,cex=1)
text(log(1/110)+0.3,sqrt(mMSE[100]),paste("k=",110),col=2,cex=1)

Boosting
set.seed(432)
# gbm's "bernoulli" distribution needs a 0/1 response, so recode Attrition
# on a copy of the training data (No = 0, Yes = 1).
train_data_copy = train_data
train_data_copy$Attrition = ifelse(train_data_copy$Attrition == "No", 0, 1)
# 1000 trees with a slow learning rate; relative influence printed below.
boosting_1 = gbm(Attrition~., data=train_data_copy, distribution="bernoulli", n.trees=1000, shrinkage = 0.01)
summary(boosting_1)

##                                               var    rel.inf
## OverTime                                 OverTime 17.5193953
## Age                                           Age  9.6924255
## JobRole                                   JobRole  8.8902645
## TotalWorkingYears               TotalWorkingYears  8.8817874
## MonthlyIncome                       MonthlyIncome  8.2345343
## StockOptionLevel                 StockOptionLevel  6.8576548
## JobInvolvement                     JobInvolvement  4.7108351
## DistanceFromHome                 DistanceFromHome  4.4297462
## NumCompaniesWorked             NumCompaniesWorked  4.2260420
## EnvironmentSatisfaction   EnvironmentSatisfaction  4.2057388
## EducationField                     EducationField  3.2797490
## YearsAtCompany                     YearsAtCompany  2.8953101
## RelationshipSatisfaction RelationshipSatisfaction  2.8519677
## WorkLifeBalance                   WorkLifeBalance  1.8881729
## DailyRate                               DailyRate  1.7463060
## YearsWithCurrManager         YearsWithCurrManager  1.5985834
## BusinessTravel                     BusinessTravel  1.5556085
## JobSatisfaction                   JobSatisfaction  1.0646936
## MonthlyRate                           MonthlyRate  1.0113264
## MaritalStatus                       MaritalStatus  0.9823965
## TrainingTimesLastYear       TrainingTimesLastYear  0.8849912
## HourlyRate                             HourlyRate  0.8217715
## JobLevel                                 JobLevel  0.7933201
## YearsSinceLastPromotion   YearsSinceLastPromotion  0.3941980
## PercentSalaryHike               PercentSalaryHike  0.2981278
## YearsInCurrentRole             YearsInCurrentRole  0.2850534
## Department                             Department  0.0000000
## Education                               Education  0.0000000
## Gender                                     Gender  0.0000000
## PerformanceRating               PerformanceRating  0.0000000

After adjusting the shrinkage and n.minobsinnode:

set.seed(432)
# Refit without the three zero-importance variables (see summary above);
# n.minobsinnode lowered to 4 after tuning.
boosting_1 = gbm(Attrition~.-PerformanceRating-Gender-Department, data=train_data_copy, distribution="bernoulli", n.trees=1000, shrinkage =0.01, n.minobsinnode =4)
# Threshold of 0.45 chosen by testing. Predicting on test_data (whose
# Attrition is still a factor) is valid because Attrition is the response,
# not a predictor.
boo_pred = ifelse(predict(boosting_1, newdata = test_data, n.trees = 1000, type="response")>0.45, 'Yes', 'No')
t6=table(predicted = boo_pred, actual = test_data$Attrition)
t6
##          actual
## predicted  No Yes
##       No  360  56
##       Yes   6  19
Random Forest

We start by fitting all the variables in the model and then take out the least important variable according to importance plots, until all existing features have positive importance.

# Random forest: backward feature elimination guided by varImpPlot. Each step
# removes the least important remaining variable, refits with the same seed,
# and reports the test-set confusion matrix. Predictions use a "No"
# probability threshold of 0.7, i.e. predict "Yes" when P(No) < 0.7.
set.seed(432)
rf_1 = randomForest(Attrition~., data = train_data, importance=TRUE)
rf_pred = ifelse(predict(rf_1, newdata = test_data, type = "prob")[ ,1] >= 0.7, 'No', 'Yes')
table(rf_pred, test_data$Attrition)
##        
## rf_pred  No Yes
##     No  342  42
##     Yes  24  33
varImpPlot(rf_1)

# Step 2: drop MonthlyRate (least important above).
set.seed(432)
rf_2 = randomForest(Attrition~.-MonthlyRate, data = train_data, importance=TRUE)
rf_pred = ifelse(predict(rf_2, newdata = test_data, type = "prob")[ ,1] >= 0.7, 'No', 'Yes')
table(rf_pred, test_data$Attrition)
##        
## rf_pred  No Yes
##     No  341  42
##     Yes  25  33
varImpPlot(rf_2)

# Step 3: also drop PerformanceRating.
set.seed(432)
rf_3 = randomForest(Attrition~.-MonthlyRate - PerformanceRating, data = train_data, importance=TRUE)
rf_pred = ifelse(predict(rf_3, newdata = test_data, type = "prob")[ ,1] >= 0.7, 'No', 'Yes')
table(rf_pred, test_data$Attrition)
##        
## rf_pred  No Yes
##     No  336  41
##     Yes  30  34
varImpPlot(rf_3)

# Step 4: also drop TrainingTimesLastYear.
set.seed(432)
rf_4 = randomForest(Attrition~.-MonthlyRate - PerformanceRating - TrainingTimesLastYear, data = train_data, importance=TRUE)
rf_pred = ifelse(predict(rf_4, newdata = test_data, type = "prob")[ ,1] >= 0.7, 'No', 'Yes')
table(rf_pred, test_data$Attrition)
##        
## rf_pred  No Yes
##     No  337  41
##     Yes  29  34
varImpPlot(rf_4)

# Step 5: also drop Gender.
set.seed(432)
rf_5 = randomForest(Attrition~.-MonthlyRate - PerformanceRating - TrainingTimesLastYear - Gender, data = train_data, importance=TRUE)
rf_pred = ifelse(predict(rf_5, newdata = test_data, type = "prob")[ ,1] >= 0.7, 'No', 'Yes')
table(rf_pred, test_data$Attrition)
##        
## rf_pred  No Yes
##     No  338  41
##     Yes  28  34
varImpPlot(rf_5)

# Step 6: also drop WorkLifeBalance.
set.seed(432)
rf_6 = randomForest(Attrition~.-MonthlyRate - PerformanceRating - TrainingTimesLastYear - Gender - WorkLifeBalance , data = train_data, importance=TRUE)
rf_pred = ifelse(predict(rf_6, newdata = test_data, type = "prob")[ ,1] >= 0.7, 'No', 'Yes')
table(rf_pred, test_data$Attrition)
##        
## rf_pred  No Yes
##     No  337  42
##     Yes  29  33
varImpPlot(rf_6)

# Step 7: also drop PercentSalaryHike.
set.seed(432)
rf_7 = randomForest(Attrition~.-MonthlyRate - PerformanceRating - TrainingTimesLastYear - Gender - WorkLifeBalance - PercentSalaryHike, data = train_data, importance=TRUE)
rf_pred = ifelse(predict(rf_7, newdata = test_data, type = "prob")[ ,1] >= 0.7, 'No', 'Yes')
table(rf_pred, test_data$Attrition)
##        
## rf_pred  No Yes
##     No  335  45
##     Yes  31  30
varImpPlot(rf_7)

# Final model: also drop DailyRate. The threshold changes to 0.6 here —
# NOTE(review): this differs from the 0.7 used in all earlier steps,
# presumably re-tuned for the final feature set; confirm intent.
set.seed(432)
rf_8 = randomForest(Attrition~.-MonthlyRate - PerformanceRating - TrainingTimesLastYear - Gender - WorkLifeBalance - PercentSalaryHike - DailyRate, data = train_data, importance=TRUE)
rf_pred = ifelse(predict(rf_8, newdata = test_data, type = "prob")[ ,1] >= 0.6, 'No', 'Yes')
t4=table(rf_pred, test_data$Attrition)
t4
##        
## rf_pred  No Yes
##     No  353  55
##     Yes  13  20
varImpPlot(rf_8)

naivebayes
set.seed(56)
# Naive Bayes with explicit class priors (1233 No / 237 Yes out of 1470).
# NOTE(review): the priors use the full dataset's class balance rather than
# the training split's — confirm this is intended.
nb_1 = naiveBayes(Attrition~., data = train_data, prior=c(1233, 237)/1470)
# Predict "No" only when P(No) >= 0.1 — a very low bar for "No", i.e. a
# threshold tuned for the imbalanced classes.
nb_pred = ifelse(predict(nb_1, test_data, type = "raw")[ ,1]>=0.1, 'No', 'Yes')
calc_acc(nb_pred, test_data$Attrition)
## [1] 0.8503401
t7=table(predicted = nb_pred, actual = test_data$Attrition)
t7
##          actual
## predicted  No Yes
##       No  356  56
##       Yes  10  19

In summary, five models were considered and tuned with feature selection and/or parameter validation. For model details, see the model summary file.